Two datasets were generated using the 10X Genomics Chromium 3’ scRNA-Seq platform:
| species | experiment_name | run_number | lane_number | sequencer | approximate_number_of_cells |
|---|---|---|---|---|---|
| pb | straight bleed experiment | 22252 | 5 | Hiseq 4000 | 30,000 |
| pb | 1:1 mix experiment | 24284 | 1 & 2 | Hiseq 2500 | 5,000 |
This script details the quality control of the 1:1 mix experiment
This data has been processed using CellRanger into counts tables. This initial analysis gave the following metrics:
Pb 1:1 mix experiment (run #: 24284 lanes 1 and 2 (Hiseq 2500)):
We will load this data in and for each run:
A. Define ‘cells’
B. Filter poor quality cells out
C. Dimensionality Reduction and Clustering
D. Remove Doublets
E. Predict life Cycle Stage (Using Bulk RNA-Seq Correlation)
[1] "Seurat is loaded correctly"
[1] "cowplot is loaded correctly"
[1] "gridExtra is loaded correctly"
[1] "grid is loaded correctly"
[1] "Hmisc is loaded correctly"
[1] "dplyr is loaded correctly"
[1] "scales is loaded correctly"
[1] "ggpubr is loaded correctly"
[1] "patchwork is loaded correctly"
This will be helpful later on. This contains annotations for each gene:
##Import gtf file:
gtf <- read.table("../data/Reference/Pberghei.gtf", sep="\t", header = FALSE)
head(gtf)
## read in 10x output
tenx5k_raw_data <- Read10X("../data/10X/tenx_24284")
## Create Seurat object
tenx5k <- CreateSeuratObject(counts = tenx5k_raw_data, min.cells = 0, min.features = 0, project = "GCSKO")
Feature names cannot have underscores ('_'), replacing with dashes ('-')
## add experiment to meta data for merging later
tenx5k@meta.data$experiment <- "tenx5k"
## inspect
tenx5k
An object of class Seurat
5098 features across 737280 samples within 1 assay
Active assay: RNA (5098 features, 0 variable features)
Plot a knee plot and then use a mixture model to define where the cells vs. background lie
## interesting reference material for this section can be found here: https://hemberg-lab.github.io/scRNA.seq.course/processing-raw-scrna-seq-data.html
## get the nUMIs
umi_per_barcode <- as.data.frame(tenx5k@meta.data$nCount_RNA)
## remove zeros as these have issues when you log them and make the model later:
umi_per_barcode <- as.data.frame(umi_per_barcode[!(umi_per_barcode$`tenx5k@meta.data$nCount_RNA`==0), ])
## get a rank for each barcode
barcode_rank <- rank(-umi_per_barcode[,1])
## then make into a list
lib_size <- (umi_per_barcode[,1])
## then log this
log_lib_size <- log10(umi_per_barcode[,1])
##plot
#plot(barcode_rank, log_lib_size, xlim=c(1,100000))
## order the barcode ranks
o <- order(barcode_rank)
## reorder the library size, barcode rank by their rank
log_lib_size <- log_lib_size[o]
barcode_rank <- barcode_rank[o]
lib_size <- lib_size[o]
make a mixture model to determine the knee of the plot
## set a seed for the mixture model
set.seed(-92497)
## mixture model calculation
require("mixtools")
mix <- normalmixEM(log_lib_size)
One of the variances is going to zero; trying new starting values.
One of the variances is going to zero; trying new starting values.
number of iterations= 17
## plot result
plot(mix, which=2, xlab2="log(mol per cell)")
Find where the distributions intersect (i.e. where cells vs. background is)
## identify where the split between the distributions is
p1 <- dnorm(log_lib_size, mean=mix$mu[1], sd=mix$sigma[1])
p2 <- dnorm(log_lib_size, mean=mix$mu[2], sd=mix$sigma[2])
if (mix$mu[1] < mix$mu[2]) {
split <- min(log_lib_size[p2 > p1])
} else {
split <- min(log_lib_size[p1 > p2])
}
## print split
split
[1] 1.724276
View the initial result
## log the barcode rank
log_barcode_rank <- log10(barcode_rank)
## plot
plot(log_barcode_rank, log_lib_size, xlim=c(1,6))
## add the split as a line on the plot
abline(h=split, col="red")
Final Figures:
## make the results of the above functions into a dataframe
df_barcodes <- as.data.frame(cbind(barcode_rank, log_lib_size, lib_size), row.names = NULL)
## add a column for if it is a cell or not
df_barcodes$cell = rownames(df_barcodes) %in% which(df_barcodes$log_lib_size > split)
## change value to a numeric
df_barcodes$cell <- as.numeric(df_barcodes$cell)
## change the 0 to a 2 for ease of handling
df_barcodes$cell[df_barcodes$cell<1] <- 2
## rename the numerics into cells or background
df_barcodes$cell[df_barcodes$cell == 1] <- "Cells"
df_barcodes$cell[df_barcodes$cell == 2] <- "Background"
## extract the cutoff for cells do you can plot the lines
boundary <- as.numeric(sum(df_barcodes$cell == "Cells"))
split <- 10^split
## make the plot
barcode_plot <- ggplot(df_barcodes, aes(x=barcode_rank, y=lib_size, colour = cell, theme_size = 40)) +
## make into a dot plot
geom_point(size = 1, shape = 16) +
## make the axis into log and specify breaks
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
## make the axis into log and specify breaks
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
annotation_logticks() +
## change colours of plot
scale_color_manual(values=c("#bdbdbd", "#5ba43a"), labels = c("Background", "Cells")) +
## change aes of legend
theme(legend.position="bottom", text = element_text(size=25), legend.text=element_text(size=25), axis.text=element_text(size=25)) +
## add the lines on the plot
geom_segment(aes(x = 0, y = split, xend = boundary, yend = split), colour = "black", alpha = 0.01) +
geom_segment(aes(x = boundary, y = 0, xend = boundary, yend = split), colour = "black", alpha = 0.01) +
## change the axis labels
labs(x = "Barcodes", y = "UMI Counts", colour="Cell Designation") +
## change the size of the legend
guides(colour = guide_legend(override.aes = list(size=10))) +
## fix axis
coord_fixed() +
## make it look pretty
theme_light()
## print the plot
barcode_plot
## save the plot
ggsave("barcode_plot_5k.png", plot = barcode_plot, device = "png", height = 15, width = 15, units = "cm", path = "../images_to_export/")
so the number of cells that is retained is:
Background Cells
138879 7762
## extract the nCount and row names from the Seurat object
upb <- data.frame(nCount_RNA = tenx5k@meta.data$nCount_RNA, row.names = rownames(tenx5k@meta.data))
## make blank column for rank
upb$rank <- NA
## order by nCounts
order.scores <- order(upb$nCount_RNA, decreasing = TRUE)
## add a rank to this column based on the ordered nCount
upb$rank[order.scores] <- 1:nrow(upb)
## inspect
#head(upb)
## make a list of cells to retain in the Seurat object
keep_cells <- rownames(upb[which(upb$rank < 7763),])
## subset Seurat object to include cells and discard background
pb_sex <- subset(tenx5k, cells = keep_cells)
Mitochondrial cell counts
## extract mitochondrial genes
#mito_genes <- gtf[which(gtf$V3 == "rRNA"),]$V9
#mito_genes <- gsub(";.*","", gsub("gene_id ", "", mito_genes))
#paste("These are the mitochondrial genes")
#head(mito_genes)
## extract mito genes
mito_genes <- pb_sex@assays$RNA@counts@Dimnames[[1]][grep("^PBANKA-MIT",pb_sex@assays$RNA@counts@Dimnames[[1]])]
## plot the genes individually
VlnPlot(object = pb_sex, features = mito_genes, pt.size = 0.01)
All cells have the same value of PBANKA-MIT0010.All cells have the same value of PBANKA-MIT0020.All cells have the same value of PBANKA-MIT0030.All cells have the same value of PBANKA-MIT0040.All cells have the same value of PBANKA-MIT0050.All cells have the same value of PBANKA-MIT0090.All cells have the same value of PBANKA-MIT0130.All cells have the same value of PBANKA-MIT0140.All cells have the same value of PBANKA-MIT0150.All cells have the same value of PBANKA-MIT0160.All cells have the same value of PBANKA-MIT0200.All cells have the same value of PBANKA-MIT0210.All cells have the same value of PBANKA-MIT0230.All cells have the same value of PBANKA-MIT0240.All cells have the same value of PBANKA-MIT0250.All cells have the same value of PBANKA-MIT0260.All cells have the same value of PBANKA-MIT0290.All cells have the same value of PBANKA-MIT0300.All cells have the same value of PBANKA-MIT0320.All cells have the same value of PBANKA-MIT0330.All cells have the same value of PBANKA-MIT0340.All cells have the same value of PBANKA-MIT0370.
## make a percentage mitocondrial for each cell (this will work as long as you filter cells out with zero counts)
pb_sex <- PercentageFeatureSet(pb_sex, pattern = "^PBANKA-MIT", col.name = "percent.mt")
plot percentage mitochondrial
## plot for percentage of mitochondrial reads
v1 <- VlnPlot(object = pb_sex, features = "percent.mt", pt.size = 0.01) +
## add a line where we will filter
geom_abline(intercept = 20, col="blue") +
## change labels
labs(x = "",y = "% Mitochondrial Reads", title = "Mitochondrial per cell") +
## remove legend
theme(legend.position = "none") +
## change appearance
theme_classic() +
scale_fill_manual(values="grey") +
#scale_y_continuous(limits = c(0, 100)) +
theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))
## print
v1
## save
#ggsave("~/images_to_export/QC_10X_mito_violin.png", plot = QC_mito_violin, device = "png", path = NULL, scale = 1, width = 15, height = 10, units = "cm", dpi = 300, limitsize = TRUE)
In the Smart-seq2 data, we use a threshold of 20%. No cell in our 10X data is higher than 13% and only
[1] 10
cells have a % mitochondrial reads above 5%.
plot individual violin plots
## nGenes plot
gene_plot_5k <- VlnPlot(object = pb_sex, features = "nFeature_RNA", pt.size = 0.01)
## improve the aesthetics
gene_plot_5k <- gene_plot_5k +
geom_abline(intercept = 200, col="blue") +
labs(x = "",y = "nGene", title = "Genes per cell") +
theme_classic() +
scale_fill_manual(values="grey") +
scale_y_continuous(limits = c(0, 3000)) +
theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
## nUMI plot
numi_plot_5k <- VlnPlot(object = pb_sex, features = "nCount_RNA", pt.size = 0.01)
## improve aesthetics
numi_plot_5k <- numi_plot_5k +
labs(x = "",y = "nUMI", title = "UMIs per cell") +
theme_classic() +
scale_fill_manual(values="grey") +
scale_y_continuous(limits = c(0, 3000)) +
theme(legend.position="none", axis.text.x = element_blank(), axis.ticks.x=element_blank(), text = element_text(size=20), legend.text=element_text(size=20), axis.text=element_text(size=20), axis.text.y=element_text(colour="black"))
Scale for 'y' is already present. Adding another scale for 'y', which will replace the
existing scale.
## plot together
grid.arrange(gene_plot_5k, numi_plot_5k, ncol = 2, top=textGrob("5K cells 10X", gp=gpar(fontsize=15,font=8)))
## save nGene plot on its own
#ggsave("ngene_plot.pdf", plot = gene1, device = "pdf", height = 5, width = 5, units = "in", path = "/Users/ar19/Desktop/PhD/GCSKO_Analysis")
plot two metrics together
## make a dataframe for important filtering metrics
df <- data.frame(nCount = log10(pb_sex@meta.data$nCount_RNA), nGenes = pb_sex@meta.data$nFeature_RNA, percent_mt = pb_sex@meta.data$percent.mt, experiment = pb_sex@meta.data$experiment)
## plot main dotplot
plot1 <- ggplot(df, aes(x = nCount, y = nGenes)) +
geom_point(aes(), size = 0.1) +
geom_rug() +
scale_y_continuous(name = "Number of Detected Genes") +
scale_x_continuous(name = "log10(Number of Total UMI)") +
theme_pubr() +
theme(legend.position = "bottom") +
geom_hline(yintercept=200)
## plot density plot x
dens1 <- ggplot(df, aes(x = nCount)) +
geom_density(alpha = 0.2) +
theme_void() +
theme(legend.position = "none")
## plot density plot y
## old method
# dens2 <- ggplot(df, aes(x = nGenes, y = experiment)) +
# geom_density(alpha = 0.2) +
# theme_void() +
# theme(legend.position = "none") +
# coord_flip()
## new method
dens <- density(df$nGenes)
dt <- data.frame(x=dens$x, y=dens$y)
probs <- c(0, 0.25, 0.5, 0.75, 1)
quantiles <- quantile(df$nGenes, prob=probs)
dt$quant <- factor(findInterval(dt$x,quantiles))
dens2 <- ggplot(dt, aes(x,y)) + geom_line() + geom_ribbon(aes(ymin=0, ymax=y, fill=quant)) + scale_x_continuous(breaks=quantiles) + scale_fill_brewer(guide="none") + theme_void() + theme(legend.position = "none") +
coord_flip()
## plot together
QC_composite_plot <- dens1 + plot_spacer() + plot1 + dens2 + plot_layout(ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4))
## print
QC_composite_plot
## save
ggsave("../images_to_export/QC_10X_composite_plot.png", plot = QC_composite_plot, device = "png", path = NULL, scale = 1, width = 20, height = 20, units = "cm", dpi = 300, limitsize = TRUE)
The threshold used in the malaria cell atlas was 230 for Pb but this is dependent on sequencing depth etc. We can plot the number of cells recovered for a range of thresholds:
paste("original number of cells =", nrow(pb_sex@meta.data))
[1] "original number of cells = 7762"
paste("with >150 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 150, ]))
[1] "with >150 filter = 7188"
paste("with >200 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 200, ]))
[1] "with >200 filter = 6631"
paste("with >230 filter =", nrow(pb_sex@meta.data[pb_sex@meta.data$nFeature_RNA > 230, ]))
[1] "with >230 filter = 6239"
Since we have already filtered on nUMI, we will filter with 200.
## number of cells before filtering
pb_sex_pre_filter_nCells <- nrow(pb_sex@meta.data)
## filter object
pb_sex <- subset(pb_sex, subset = nFeature_RNA > 200)
## number of cells after filtering
pb_sex_post_filter_nCells <- nrow(pb_sex@meta.data)
## print results of filtering
paste("number of cells pre-filter", pb_sex_pre_filter_nCells)
[1] "number of cells pre-filter 7762"
paste("number of cells post-filter", pb_sex_post_filter_nCells)
[1] "number of cells post-filter 6631"
paste((pb_sex_pre_filter_nCells - pb_sex_post_filter_nCells), "cells were removed by filtering on number of genes.")
[1] "1131 cells were removed by filtering on number of genes."
## normalise object
pb_sex <- NormalizeData(pb_sex, normalization.method = "LogNormalize", scale.factor = 10000)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
## find variable genes
pb_sex <- FindVariableFeatures(pb_sex, selection.method = "vst", nfeatures = 2000)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
## scale the data
all.genes <- rownames(pb_sex)
pb_sex <- ScaleData(pb_sex, features = all.genes)
Centering and scaling data matrix
|
| | 0%
|
|============== | 17%
|
|============================ | 33%
|
|========================================== | 50%
|
|========================================================= | 67%
|
|======================================================================= | 83%
|
|=====================================================================================| 100%
## run PCA
pb_sex <- RunPCA(pb_sex, features = VariableFeatures(object = pb_sex))
PC_ 1
Positive: PBANKA-1235600, PBANKA-0931300, PBANKA-1214300, PBANKA-1400400, PBANKA-1101100, PBANKA-0205000, PBANKA-0311800, PBANKA-0823400, PBANKA-1242300, PBANKA-1246600
PBANKA-0722600, PBANKA-1008500, PBANKA-1308600, PBANKA-0932400, PBANKA-1314800, PBANKA-1300096, PBANKA-1309900, PBANKA-1200096, PBANKA-1145400, PBANKA-0722801
PBANKA-1100441, PBANKA-1040521, PBANKA-1326400, PBANKA-1101300, PBANKA-0941300, PBANKA-1210200, PBANKA-1303800, PBANKA-0216061, PBANKA-0406500, PBANKA-1127000
Negative: PBANKA-1312700, PBANKA-1224900, PBANKA-0824400, PBANKA-1453000, PBANKA-1115200, PBANKA-1432200, PBANKA-1204200, PBANKA-0703500, PBANKA-0702000, PBANKA-0812600
PBANKA-0312700, PBANKA-1128800, PBANKA-1218300, PBANKA-1106500, PBANKA-1430600, PBANKA-1449300, PBANKA-0209300, PBANKA-1232600, PBANKA-1432400, PBANKA-1421500
PBANKA-1320800, PBANKA-0620400, PBANKA-1038800, PBANKA-1110700, PBANKA-1336200, PBANKA-0417600, PBANKA-1451100, PBANKA-1143800, PBANKA-1414500, PBANKA-0402700
PC_ 2
Positive: PBANKA-1431500, PBANKA-1431400, PBANKA-0942400, PBANKA-1340400, PBANKA-0934700, PBANKA-1038700, PBANKA-0942300, PBANKA-1449000, PBANKA-1208800, PBANKA-0828900
PBANKA-0911000, PBANKA-0306100, PBANKA-0927700, PBANKA-0806800, PBANKA-0416100, PBANKA-0507800, PBANKA-1409600, PBANKA-0411300, PBANKA-0406200, PBANKA-0415800
PBANKA-1033700, PBANKA-1361500, PBANKA-0925400, PBANKA-0615200, PBANKA-1108800, PBANKA-0507300, PBANKA-0720100, PBANKA-0920700, PBANKA-1030400, PBANKA-0510600
Negative: PBANKA-1317200, PBANKA-1352500, PBANKA-0810700, PBANKA-1225500, PBANKA-1319500, PBANKA-0402500, PBANKA-0827400, PBANKA-1419300, PBANKA-0714000, PBANKA-1315300
PBANKA-0105100, PBANKA-1436300, PBANKA-1311400, PBANKA-0417600, PBANKA-0821400, PBANKA-0417200, PBANKA-0204500, PBANKA-1342300, PBANKA-0704900, PBANKA-1415200
PBANKA-0605800, PBANKA-1115000, PBANKA-0517600, PBANKA-1329900, PBANKA-1134900, PBANKA-1231300, PBANKA-1233600, PBANKA-1035200, PBANKA-0912500, PBANKA-1363600
PC_ 3
Positive: PBANKA-1400600, PBANKA-1459300, PBANKA-0416000, PBANKA-0304800, PBANKA-0305100, PBANKA-1365200, PBANKA-1443300, PBANKA-0830200, PBANKA-0938400, PBANKA-0827200
PBANKA-1137800, PBANKA-1437300, PBANKA-0932200, PBANKA-0506100, PBANKA-1349000, PBANKA-0313800, PBANKA-1113300, PBANKA-0409800, PBANKA-1017100, PBANKA-1117000
PBANKA-0408500, PBANKA-1450300, PBANKA-0922500, PBANKA-0934300, PBANKA-0505200, PBANKA-0941800, PBANKA-0915200, PBANKA-0722921, PBANKA-0316600, PBANKA-0519400
Negative: PBANKA-1431500, PBANKA-1431400, PBANKA-1449000, PBANKA-0806800, PBANKA-0927700, PBANKA-0911000, PBANKA-1108000, PBANKA-0828900, PBANKA-0601200, PBANKA-1208800
PBANKA-1409600, PBANKA-0830900, PBANKA-0942400, PBANKA-1029400, PBANKA-1038700, PBANKA-0102700, PBANKA-1316500, PBANKA-1304500, PBANKA-0521200, PBANKA-1333100
PBANKA-0902400, PBANKA-1129600, PBANKA-1119800, PBANKA-0822900, PBANKA-1109600, PBANKA-0301800, PBANKA-1429100, PBANKA-0622000, PBANKA-1038200, PBANKA-0942300
PC_ 4
Positive: PBANKA-0107300, PBANKA-1214300, PBANKA-0814200, PBANKA-0713300, PBANKA-0604300, PBANKA-0405200, PBANKA-0109100, PBANKA-0932200, PBANKA-0823400, PBANKA-1460400
PBANKA-1448000, PBANKA-0416500, PBANKA-0938400, PBANKA-0406500, PBANKA-1302800, PBANKA-1130500, PBANKA-1437300, PBANKA-1444100, PBANKA-0211500, PBANKA-1419100
PBANKA-0710100, PBANKA-1223100, PBANKA-1235600, PBANKA-1450300, PBANKA-0919600, PBANKA-0916200, PBANKA-1242300, PBANKA-1326400, PBANKA-0313800, PBANKA-0818900
Negative: PBANKA-0519000, PBANKA-0519100, PBANKA-0519300, PBANKA-0831000, PBANKA-1349100, PBANKA-1344400, PBANKA-1002400, PBANKA-0713100, PBANKA-0523700, PBANKA-0111000
PBANKA-0519400, PBANKA-1349000, PBANKA-1032100, PBANKA-1014500, PBANKA-0932000, PBANKA-0804500, PBANKA-1315700, PBANKA-0501400, PBANKA-1425900, PBANKA-0519200
PBANKA-1101400, PBANKA-0509600, PBANKA-0619700, PBANKA-1035400, PBANKA-0417800, PBANKA-1210600, PBANKA-1327100, PBANKA-0523800, PBANKA-0307500, PBANKA-1117200
PC_ 5
Positive: PBANKA-0501600, PBANKA-1240600, PBANKA-1002600, PBANKA-0707700, PBANKA-0112200, PBANKA-1002400, PBANKA-0208900, PBANKA-0307500, PBANKA-0812200, PBANKA-1423000
PBANKA-0832800, PBANKA-1119600, PBANKA-1409200, PBANKA-0716900, PBANKA-0100400, PBANKA-1364400, PBANKA-1347000, PBANKA-0915000, PBANKA-0819600, PBANKA-1319000
PBANKA-1212500, PBANKA-1400091, PBANKA-0900900, PBANKA-1202000, PBANKA-1345900, PBANKA-0311500, PBANKA-1008600, PBANKA-0911700, PBANKA-1349100, PBANKA-0519200
Negative: PBANKA-1443300, PBANKA-0925600, PBANKA-0306700, PBANKA-1106500, PBANKA-0907200, PBANKA-0304800, PBANKA-1217400, PBANKA-1107600, PBANKA-1349000, PBANKA-1319300
PBANKA-1313100, PBANKA-0312500, PBANKA-0611700, PBANKA-1113300, PBANKA-0208000, PBANKA-1241800, PBANKA-0305100, PBANKA-1438000, PBANKA-1339300, PBANKA-0304400
PBANKA-0833300, PBANKA-0206300, PBANKA-0207500, PBANKA-0519400, PBANKA-0715300, PBANKA-1360000, PBANKA-0112100, PBANKA-1340500, PBANKA-0915200, PBANKA-1416800
## plot
DimPlot(pb_sex, reduction = "pca")
## elbow plot
ElbowPlot(pb_sex, ndims = 30, reduction = "pca")
## run UMAP
pb_sex <- RunUMAP(pb_sex, dims = 1:10, seed.use = 1234, n.neighbors = 150)
16:05:48 UMAP embedding parameters a = 0.9922 b = 1.112
16:05:48 Read 6631 rows and found 10 numeric columns
16:05:48 Using Annoy for neighbor search, n_neighbors = 150
16:05:48 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:05:51 Writing NN index file to temp file /var/folders/wj/rztzclxn1t10cl2sk0plbf3r0000gn/T//RtmpyuC0hm/fileef95491096fc
16:05:51 Searching Annoy index using 1 thread, search_k = 15000
16:06:03 Annoy recall = 100%
16:06:05 Commencing smooth kNN distance calibration using 1 thread
16:06:09 Initializing from normalized Laplacian + noise
16:06:11 Commencing optimization for 500 epochs, with 1163176 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:06:31 Optimization finished
## plot
DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)
## These are the parameters used in the merge UMAP
#pb_sex <- RunUMAP(pb_sex, reduction = "pca", dims = 1:10, n.neighbors = 150, seed.use = 1234, min.dist = 0.4, repulsion.strength = 0.03, local.connectivity = 150)
#DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)
colour with a few marker genes:
# PBANKA-0515000 - p25 - female
# PBANKA-1212600 - HAP2 - male
# PBANKA-0600600 - NEK3 - male
# PBANKA-0831000 - MSP1 - late asexual
# PBANKA-1315700 - RON2 - (asexuals and some male?)
# PBANKA-0416100 - dynenin heavy chain - male - used in 820 line
# PBANKA-1319500 - CCP2 - female - used in 820 line
# PBANKA-1437500 - AP2-G - seuxal commitment gene
# PBANKA-1102200 - MSP8 - early asexual (from Bozdech paper)
FeaturePlot(pb_sex, features = c("PBANKA-0515000", "PBANKA-1212600","PBANKA-0600600", "PBANKA-0831000", "PBANKA-1315700", "PBANKA-0416100", "PBANKA-1319500", "PBANKA-1437500", "PBANKA-1102200"))
pb_sex <- FindNeighbors(pb_sex, dims = 1:21)
Computing nearest neighbor graph
Computing SNN
pb_sex <- FindClusters(pb_sex, resolution = 1)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 6631
Number of edges: 277150
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8104
Number of communities: 14
Elapsed time: 1 seconds
DoubletFinder function
## DoubletFinder should be able to be installed and run as so:
#devtools::install_github('chris-mcginnis-ucsf/DoubletFinder')
#library(doubletFinder) #allows removal of doublets
## but there seems to be some problems with this so we will run it from the github code
## the doublet finder function
doubletFinder_v3 <- function(seu, PCs, pN = 0.25, pK, nExp, reuse.pANN = FALSE, sct = FALSE) {
require(Seurat); require(fields); require(KernSmooth)
## Generate new list of doublet classificatons from existing pANN vector to save time
if (reuse.pANN != FALSE ) {
pANN.old <- seu@meta.data[ , reuse.pANN]
classifications <- rep("Singlet", length(pANN.old))
classifications[order(pANN.old, decreasing=TRUE)[1:nExp]] <- "Doublet"
seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
return(seu)
}
if (reuse.pANN == FALSE) {
## Make merged real-artifical data
real.cells <- rownames(seu@meta.data)
data <- seu@assays$RNA@counts[, real.cells]
n_real.cells <- length(real.cells)
n_doublets <- round(n_real.cells/(1 - pN) - n_real.cells)
print(paste("Creating",n_doublets,"artificial doublets...",sep=" "))
real.cells1 <- sample(real.cells, n_doublets, replace = TRUE)
real.cells2 <- sample(real.cells, n_doublets, replace = TRUE)
doublets <- (data[, real.cells1] + data[, real.cells2])/2
colnames(doublets) <- paste("X", 1:n_doublets, sep = "")
data_wdoublets <- cbind(data, doublets)
## Store important pre-processing information
orig.commands <- seu@commands
## Pre-process Seurat object
if (sct == FALSE) {
print("Creating Seurat object...")
seu_wdoublets <- CreateSeuratObject(counts = data_wdoublets)
print("Normalizing Seurat object...")
seu_wdoublets <- NormalizeData(seu_wdoublets,
normalization.method = orig.commands$NormalizeData.RNA@params$normalization.method,
scale.factor = orig.commands$NormalizeData.RNA@params$scale.factor,
margin = orig.commands$NormalizeData.RNA@params$margin)
print("Finding variable genes...")
seu_wdoublets <- FindVariableFeatures(seu_wdoublets,
selection.method = orig.commands$FindVariableFeatures.RNA$selection.method,
loess.span = orig.commands$FindVariableFeatures.RNA$loess.span,
clip.max = orig.commands$FindVariableFeatures.RNA$clip.max,
mean.function = orig.commands$FindVariableFeatures.RNA$mean.function,
dispersion.function = orig.commands$FindVariableFeatures.RNA$dispersion.function,
num.bin = orig.commands$FindVariableFeatures.RNA$num.bin,
binning.method = orig.commands$FindVariableFeatures.RNA$binning.method,
nfeatures = orig.commands$FindVariableFeatures.RNA$nfeatures,
mean.cutoff = orig.commands$FindVariableFeatures.RNA$mean.cutoff,
dispersion.cutoff = orig.commands$FindVariableFeatures.RNA$dispersion.cutoff)
print("Scaling data...")
seu_wdoublets <- ScaleData(seu_wdoublets,
features = orig.commands$ScaleData.RNA$features,
model.use = orig.commands$ScaleData.RNA$model.use,
do.scale = orig.commands$ScaleData.RNA$do.scale,
do.center = orig.commands$ScaleData.RNA$do.center,
scale.max = orig.commands$ScaleData.RNA$scale.max,
block.size = orig.commands$ScaleData.RNA$block.size,
min.cells.to.block = orig.commands$ScaleData.RNA$min.cells.to.block)
print("Running PCA...")
seu_wdoublets <- RunPCA(seu_wdoublets,
features = orig.commands$ScaleData.RNA$features,
npcs = length(PCs),
rev.pca = orig.commands$RunPCA.RNA$rev.pca,
weight.by.var = orig.commands$RunPCA.RNA$weight.by.var,
verbose=FALSE)
pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
cell.names <- rownames(seu_wdoublets@meta.data)
nCells <- length(cell.names)
rm(seu_wdoublets); gc() # Free up memory
}
if (sct == TRUE) {
require(sctransform)
print("Creating Seurat object...")
seu_wdoublets <- CreateSeuratObject(counts = data_wdoublets)
print("Running SCTransform...")
seu_wdoublets <- SCTransform(seu_wdoublets)
print("Running PCA...")
seu_wdoublets <- RunPCA(seu_wdoublets, npcs = length(PCs))
pca.coord <- seu_wdoublets@reductions$pca@cell.embeddings[ , PCs]
cell.names <- rownames(seu_wdoublets@meta.data)
nCells <- length(cell.names)
rm(seu_wdoublets); gc()
}
## Compute PC distance matrix
print("Calculating PC distance matrix...")
dist.mat <- fields::rdist(pca.coord)
## Compute pANN
print("Computing pANN...")
pANN <- as.data.frame(matrix(0L, nrow = n_real.cells, ncol = 1))
rownames(pANN) <- real.cells
colnames(pANN) <- "pANN"
k <- round(nCells * pK)
for (i in 1:n_real.cells) {
neighbors <- order(dist.mat[, i])
neighbors <- neighbors[2:(k + 1)]
neighbor.names <- rownames(dist.mat)[neighbors]
pANN$pANN[i] <- length(which(neighbors > n_real.cells))/k
}
print("Classifying doublets..")
classifications <- rep("Singlet",n_real.cells)
classifications[order(pANN$pANN[1:n_real.cells], decreasing=TRUE)[1:nExp]] <- "Doublet"
seu@meta.data[, paste("pANN",pN,pK,nExp,sep="_")] <- pANN[rownames(seu@meta.data), 1]
seu@meta.data[, paste("DF.classifications",pN,pK,nExp,sep="_")] <- classifications
return(seu)
}
}
## usage: https://rdrr.io/github/chris-mcginnis-ucsf/DoubletFinder/man/doubletFinder_v3.html
## source: https://github.com/chris-mcginnis-ucsf/DoubletFinder/blob/master/R/doubletFinder_v3.R
Run DoubletFinder
# the tutorial recommends using this as an approximation:
#nExp_poi <- round(0.15*nrow(pb_sex@meta.data))
#but a more appropriate approximation is that the expected number of doublets is ~1% per 1000 cells so:
nExp_poi <- round((0.01*(nrow(pb_sex@meta.data)/1000))*nrow(pb_sex@meta.data))
#run doublet finder:
pb_sex <- doubletFinder_v3(pb_sex, PCs = 1:21, pN = 0.25, pK = 0.01, nExp = nExp_poi, reuse.pANN = FALSE, sct = FALSE)
[1] "Creating 2210 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|============== | 17%
|
|============================ | 33%
|
|========================================== | 50%
|
|========================================================= | 67%
|
|======================================================================= | 83%
|
|=====================================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
results in:
table(pb_sex@meta.data$DF.classifications_0.25_0.01_440)
Doublet Singlet
440 6191
visualise where doublets are:
doublet.cells <- c(rownames(pb_sex@meta.data[pb_sex@meta.data$DF.classifications_0.25_0.01_440 == "Doublet",]))
d1 <- DimPlot(pb_sex, reduction = "umap", cells.highlight = doublet.cells, sizes.highlight = 2)
#TSNEPlot(object = pb_sex, cells.highlight = doublet.cells, do.return = TRUE, )
doublet1 <- d1 + coord_fixed() + theme(axis.text.x=element_blank())
## plot clusters
cluster_plot <- DimPlot(pb_sex, reduction = "umap", group.by = "ident", label = TRUE)
doublet1 + cluster_plot
VlnPlot(object = pb_sex, features = "pANN_0.25_0.01_440", pt.size = 0.1)
## extract meta data cols of interest
df <- pb_sex@meta.data[,c("RNA_snn_res.1","DF.classifications_0.25_0.01_440")]
## make a table from doublets
df <- data.frame(rbind(table(df)))
## ammend rownames
df$cluster <- rownames(df)
## calculate percentages of cells that are doublets
df$pc_doublet <- ((df[,1])/((df[,1]) + df[,2]))*100
## inspect
#kable(df[order(df$pc_doublet),])
## plot
ggplot(data=df, aes(x=cluster, y=pc_doublet)) +
geom_col(fill="steelblue") +
theme_minimal()
It definitely seems like there are some biases in doublet detection. Fewer doublets in very early rings and in mature sexes may be due to a smaller number of the population being these cells.
It may also be biological, that these cells are less likely to associate to one another (although less likely, as doublets are a result of the probability of two cells being captured inside the same droplet together at a certain loading concentration, rather than two cells already being together upon droplet capture).
remove doublets:
## make a list of singlet cells
keep_singlets <- rownames(pb_sex@meta.data[pb_sex@meta.data$DF.classifications_0.25_0.01_440 == "Singlet",])
## subset into new seurat object
pb_sex_filtered <- subset(pb_sex, cells = keep_singlets, subset.raw = TRUE)
The following arguments are not used: subset.raw
## compare old and new objects
pb_sex
An object of class Seurat
5098 features across 6631 samples within 1 assay
Active assay: RNA (5098 features, 2000 variable features)
2 dimensional reductions calculated: pca, umap
pb_sex_filtered
An object of class Seurat
5098 features across 6191 samples within 1 assay
Active assay: RNA (5098 features, 2000 variable features)
2 dimensional reductions calculated: pca, umap
Add in bulk data predictions
#Pb Prediction correlations with bulk data (asexual hoo):
#Load in required package:
library(Hmisc)
#Cooerce expression data into a matrix and load in the reference timecourse data:
x10 <- as.matrix(pb_sex_filtered@assays$RNA@data)
rownames(x10) <- gsub("-", "_", rownames(x10))
#read in bulk data:
hoo<-as.matrix(read.table("../data/Reference/hoo_berg2.txt",header=T, row.names=1))
#Make a blank dataframe in which to add prediction:
df <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(df) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
#Do correlations with bulk data using both Spearman and Pearson (and the top 1000 genes):
for (i in 1:ncol(x10))
{
shared<-intersect(row.names(as.matrix(head(sort(x10[,i], decreasing=TRUE),1000))),row.names(hoo))
step0<-rcorr(x10[shared,i],hoo[shared,1:12],type = "spearman")
step1<-as.matrix(t(step0$r[2:13,1]))
step2<-rcorr(x10[shared,i],hoo[shared,1:12],type = "pearson")
step3<-as.matrix(t(step2$r[2:13,1]))
step4<-cbind(colnames(step1)[which.max(step1)],step1[which.max(step1)],colnames(step3)[which.max(step3)],step3[which.max(step3)])
colnames(step4) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
rownames(step4)<-colnames(x10)[i]
df<-rbind(df,step4)
}
#Write out data into a csv file:
#write.csv(dfringr,file="/Users/ar19/Desktop/PhD/AR04_GCSKO_project/All_mutants_Feb_2018/predictionpbcombined.csv")
#Change the format of the output to make it more readable:
#gsub("Pb_","", dfringr[,1]) - Make predictions into 18hr.dat format:
#spearman:
df[,1] <- gsub("Pb_","", df[,1])
#Remove hr.dat from list:
df[,1] <- gsub("hr.dat","", df[,1])
#Check - dfringr[,1]
#Make into a number:
df[,1] <- as.numeric(df[,1])
df[,2] <- as.numeric(as.character(df[,2]))
#pearson:
df[,3] <- gsub("Pb_","", df[,3])
#Remove hr.dat from list:
df[,3] <- gsub("hr.dat","", df[,3])
#Check - dfringr[,1]
#Make into a number:
df[,3] <- as.numeric(df[,3])
df[,4] <- as.numeric(as.character(df[,4]))
#add to 10X object:
pb_sex_filtered <- AddMetaData(pb_sex_filtered, metadata = df)
Invalid name supplied, making object name syntactically valid. New object name is Prediction.Spearman.r.Spearman.Prediction.Pearsons.r.Pearsons.; see ?make.names for more details on syntax validity
Can also do with Kasia’s timecourse data:
kas<-as.matrix(read.table("../data/Reference/AP2OETC.txt",header=T, row.names=1))
#Make a blank dataframe in which to add prediction:
dfs <- data.frame(matrix(ncol = 4, nrow = 0))
colnames(dfs) <- c("ID","Prediction","r (Pearson)")
#Do correlations with bulk data using both Spearman and Pearson (and the top 1000 genes):
for (i in 1:ncol(x10))
{
shared<-intersect(row.names(as.matrix(head(sort(x10[,i], decreasing=TRUE),1000))),rownames(kas))
step0<-rcorr(x10[shared,i],kas[shared,1:10],type = "spearman")
step1<-as.matrix(t(step0$r[2:11,1]))
step2<-rcorr(x10[shared,i],kas[shared,1:10],type = "pearson")
step3<-as.matrix(t(step2$r[2:11,1]))
step4<-cbind(colnames(step1)[which.max(step1)],step1[which.max(step1)],colnames(step3)[which.max(step3)],step3[which.max(step3)])
colnames(step4) <- c("Prediction(Spearman)","r(Spearman)","Prediction(Pearsons)","r(Pearsons)")
rownames(step4)<-colnames(x10)[i]
dfs<-rbind(dfs,step4)
}
#Write out data into a csv file:
#write.csv(df,file="/Users/ar19/Desktop/PhD/AR04_GCSKO_project/All_mutants_Feb_2018/predictionkasiacombined.csv")
#Change the format of the output to make it more readable:
#gsub("Pb_","", dfs[,1]) - Make predictions into 18hr.dat format:
dfs[,1] <- gsub("X","", dfs[,1])
#Make into a number:
dfs[,1] <- as.numeric(dfs[,1])
#Make into a number:
dfs[,2] <- as.numeric(as.character(dfs[,2]))
#gsub("Pb_","", dfs[,1]) - Make predictions into 18hr.dat format:
dfs[,3] <- gsub("X","", dfs[,3])
#Make into a number:
dfs[,3] <- as.numeric(dfs[,3])
#dfs[,1]
#Make into a number:
dfs[,4] <- as.numeric(as.character(dfs[,4]))
colnames(dfs) <- c('Prediction(Spearman)_Kasia', 'r(Spearman)_Kasia', 'Prediction(Pearson)_Kasia', 'r(Pearson)_Kasia')
#add to Seurat:
#add to 10X object:
pb_sex_filtered <- AddMetaData(pb_sex_filtered, dfs)
Invalid name supplied, making object name syntactically valid. New object name is Prediction.Spearman._Kasiar.Spearman._KasiaPrediction.Pearson._Kasiar.Pearson._Kasia; see ?make.names for more details on syntax validity
Confirm life cycle designations:
## plot
FeaturePlot(pb_sex_filtered, features = c("Prediction.Spearman._Kasia", "Prediction.Spearman."))
optimse UMAP
## PCA calc
pb_sex_filtered <- RunPCA(pb_sex_filtered, features = VariableFeatures(object = pb_sex_filtered))
The following 5 features requested have zero variance (running reduction without them): PBANKA-0612861, PBANKA-0836921, PBANKA-1000041, PBANKA-API0290, PBANKA-API0031PC_ 1
Positive: PBANKA-1235600, PBANKA-0931300, PBANKA-1214300, PBANKA-1400400, PBANKA-1101100, PBANKA-0205000, PBANKA-0311800, PBANKA-0823400, PBANKA-1242300, PBANKA-1246600
PBANKA-0722600, PBANKA-1008500, PBANKA-0932400, PBANKA-1308600, PBANKA-1314800, PBANKA-1309900, PBANKA-1300096, PBANKA-1145400, PBANKA-1200096, PBANKA-1100441
PBANKA-0722801, PBANKA-1101300, PBANKA-1326400, PBANKA-1040521, PBANKA-0941300, PBANKA-1127000, PBANKA-1303800, PBANKA-0406500, PBANKA-1210200, PBANKA-0814200
Negative: PBANKA-1312700, PBANKA-1224900, PBANKA-0824400, PBANKA-1115200, PBANKA-1453000, PBANKA-1432200, PBANKA-1204200, PBANKA-0703500, PBANKA-0702000, PBANKA-0812600
PBANKA-0312700, PBANKA-1128800, PBANKA-1218300, PBANKA-1232600, PBANKA-1432400, PBANKA-1430600, PBANKA-1449300, PBANKA-1106500, PBANKA-0209300, PBANKA-0620400
PBANKA-1421500, PBANKA-1336200, PBANKA-1320800, PBANKA-0417600, PBANKA-1038800, PBANKA-1129800, PBANKA-1414500, PBANKA-1451100, PBANKA-1143800, PBANKA-0402700
PC_ 2
Positive: PBANKA-1431500, PBANKA-1431400, PBANKA-0942400, PBANKA-1340400, PBANKA-1449000, PBANKA-1038700, PBANKA-0942300, PBANKA-0934700, PBANKA-1208800, PBANKA-0828900
PBANKA-0911000, PBANKA-0306100, PBANKA-0927700, PBANKA-0806800, PBANKA-1409600, PBANKA-0406200, PBANKA-0411300, PBANKA-0415800, PBANKA-0507800, PBANKA-0416100
PBANKA-1033700, PBANKA-1361500, PBANKA-1108800, PBANKA-0920700, PBANKA-0510600, PBANKA-0925400, PBANKA-0720100, PBANKA-0507300, PBANKA-1119800, PBANKA-1030400
Negative: PBANKA-1317200, PBANKA-1352500, PBANKA-0827400, PBANKA-0810700, PBANKA-1225500, PBANKA-1319500, PBANKA-0402500, PBANKA-1419300, PBANKA-0714000, PBANKA-1315300
PBANKA-1436300, PBANKA-0105100, PBANKA-1311400, PBANKA-0821400, PBANKA-0417600, PBANKA-0204500, PBANKA-1342300, PBANKA-0417200, PBANKA-0605800, PBANKA-0704900
PBANKA-0517600, PBANKA-1231300, PBANKA-1329900, PBANKA-1415200, PBANKA-1115000, PBANKA-1134900, PBANKA-1035200, PBANKA-1233600, PBANKA-0907100, PBANKA-0912500
PC_ 3
Positive: PBANKA-1400600, PBANKA-1459300, PBANKA-0416000, PBANKA-0305100, PBANKA-0304800, PBANKA-1443300, PBANKA-1365200, PBANKA-0830200, PBANKA-0938400, PBANKA-1349000
PBANKA-0827200, PBANKA-1137800, PBANKA-1437300, PBANKA-1113300, PBANKA-0932200, PBANKA-0506100, PBANKA-1017100, PBANKA-0313800, PBANKA-0408500, PBANKA-1117000
PBANKA-0409800, PBANKA-0922500, PBANKA-0519400, PBANKA-0925600, PBANKA-0316600, PBANKA-0915200, PBANKA-0722921, PBANKA-0934300, PBANKA-0941800, PBANKA-1225000
Negative: PBANKA-1431500, PBANKA-1431400, PBANKA-1449000, PBANKA-0806800, PBANKA-1108000, PBANKA-0927700, PBANKA-0911000, PBANKA-0828900, PBANKA-0601200, PBANKA-1208800
PBANKA-1409600, PBANKA-0830900, PBANKA-0521200, PBANKA-1029400, PBANKA-0102700, PBANKA-0942400, PBANKA-1304500, PBANKA-1316500, PBANKA-1038700, PBANKA-1333100
PBANKA-0902400, PBANKA-1109600, PBANKA-1129600, PBANKA-1119800, PBANKA-0822900, PBANKA-0719700, PBANKA-1429100, PBANKA-1038200, PBANKA-0301800, PBANKA-1320100
PC_ 4
Positive: PBANKA-0519000, PBANKA-1349100, PBANKA-0519100, PBANKA-0519300, PBANKA-1344400, PBANKA-0831000, PBANKA-1002400, PBANKA-0713100, PBANKA-0523700, PBANKA-0111000
PBANKA-1032100, PBANKA-1014500, PBANKA-0519400, PBANKA-0932000, PBANKA-0804500, PBANKA-1349000, PBANKA-1315700, PBANKA-0501400, PBANKA-1101400, PBANKA-1425900
PBANKA-0519200, PBANKA-1210600, PBANKA-0509600, PBANKA-0619700, PBANKA-1327100, PBANKA-0523800, PBANKA-0307500, PBANKA-1035400, PBANKA-1119600, PBANKA-1117200
Negative: PBANKA-0107300, PBANKA-0814200, PBANKA-1214300, PBANKA-0713300, PBANKA-0604300, PBANKA-0405200, PBANKA-0932200, PBANKA-0109100, PBANKA-0823400, PBANKA-0938400
PBANKA-0416500, PBANKA-1460400, PBANKA-1437300, PBANKA-1302800, PBANKA-1130500, PBANKA-0406500, PBANKA-1448000, PBANKA-0211500, PBANKA-1444100, PBANKA-1450300
PBANKA-0710100, PBANKA-1223100, PBANKA-1419100, PBANKA-0313800, PBANKA-0818900, PBANKA-0919600, PBANKA-1235600, PBANKA-1242300, PBANKA-0916200, PBANKA-1203700
PC_ 5
Positive: PBANKA-0501600, PBANKA-1240600, PBANKA-0707700, PBANKA-0112200, PBANKA-1002600, PBANKA-0208900, PBANKA-0812200, PBANKA-1423000, PBANKA-0832800, PBANKA-1409200
PBANKA-0307500, PBANKA-0716900, PBANKA-0100400, PBANKA-1002400, PBANKA-0915000, PBANKA-1364400, PBANKA-1119600, PBANKA-1347000, PBANKA-0819600, PBANKA-1319000
PBANKA-1212500, PBANKA-0900900, PBANKA-1202000, PBANKA-1400091, PBANKA-1228800, PBANKA-0911700, PBANKA-0311500, PBANKA-1345900, PBANKA-1025300, PBANKA-0519200
Negative: PBANKA-1443300, PBANKA-0925600, PBANKA-0306700, PBANKA-0907200, PBANKA-1349000, PBANKA-1106500, PBANKA-1217400, PBANKA-1107600, PBANKA-1319300, PBANKA-0304800
PBANKA-0519400, PBANKA-0312500, PBANKA-1313100, PBANKA-0305100, PBANKA-0112100, PBANKA-0611700, PBANKA-1241800, PBANKA-0304400, PBANKA-0833300, PBANKA-0208000
PBANKA-1113300, PBANKA-1339300, PBANKA-1438000, PBANKA-0206300, PBANKA-1340500, PBANKA-0207500, PBANKA-0915200, PBANKA-0808000, PBANKA-1020100, PBANKA-1360000
## elbow plot
ElbowPlot(pb_sex_filtered, ndims = 30, reduction = "pca")
## UMAP calc
pb_sex_filtered <- RunUMAP(pb_sex_filtered, dims = 1:8, seed.use = 300, n.neighbors = 60, min.dist = 0.5, repulsion.strength = 0.05, local.connectivity = 20)
16:11:27 UMAP embedding parameters a = 0.583 b = 1.334
16:11:27 Read 6191 rows and found 8 numeric columns
16:11:27 Using Annoy for neighbor search, n_neighbors = 60
16:11:27 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:11:29 Writing NN index file to temp file /var/folders/wj/rztzclxn1t10cl2sk0plbf3r0000gn/T//RtmpyuC0hm/fileef9560289fdd
16:11:30 Searching Annoy index using 1 thread, search_k = 6000
16:11:36 Annoy recall = 100%
16:11:39 Commencing smooth kNN distance calibration using 1 thread
16:11:39 6191 smooth knn distance failures
16:11:43 Initializing from normalized Laplacian + noise
16:11:45 Commencing optimization for 500 epochs, with 184004 positive edges
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:12:22 Optimization finished
## UMAP plot
DimPlot(pb_sex_filtered, reduction = "umap", group.by = "ident", label = TRUE)
Save environment
## This saves everything in the global environment for easy recall later
#save.image(file = "GCSKO_10X_QC.RData")
#load(file = "GCSKO_10X_QC.RData")
Save object(s)
## save Rdata
#save(pb_30k_sex_filtered, pb_sex_filtered, file = "Part_2_input.Rdata")
#load(file = "Part_2_input.Rdata")
## save RDS
saveRDS(pb_sex_filtered, file = "../data_to_export/pb_sex_filtered.RDS", compress = FALSE)
#pb_sex_filtered <- readRDS("pb_sex_filtered.RDS")
## Save Robj
#save(pb_sex_filtered,file="pb_sex_filtered.Robj")
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.6
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
locale:
[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods base
other attached packages:
[1] viridis_0.5.1 viridisLite_0.3.0 KernSmooth_2.23-17 fields_10.3
[5] maps_3.3.0 spam_2.5-1 dotCall64_1.0-0 ggridges_0.5.2
[9] patchwork_1.0.1 ggpubr_0.4.0 scales_1.1.1 mixtools_1.2.0
[13] dplyr_1.0.0 Hmisc_4.4-0 ggplot2_3.3.2 Formula_1.2-3
[17] survival_3.2-3 lattice_0.20-41 gridExtra_2.3 cowplot_1.0.0
[21] Seurat_3.2.0 knitr_1.29
loaded via a namespace (and not attached):
[1] readxl_1.3.1 backports_1.1.8 plyr_1.8.6 igraph_1.2.5
[5] lazyeval_0.2.2 splines_4.0.2 listenv_0.8.0 usethis_1.6.1
[9] digest_0.6.25 htmltools_0.5.0 fansi_0.4.1 magrittr_1.5
[13] checkmate_2.0.0 memoise_1.1.0 tensor_1.5 cluster_2.1.0
[17] ROCR_1.0-11 openxlsx_4.1.5 remotes_2.1.1 globals_0.12.5
[21] prettyunits_1.1.1 jpeg_0.1-8.1 colorspace_1.4-1 ggrepel_0.8.2
[25] haven_2.3.1 xfun_0.15 callr_3.4.3 crayon_1.3.4
[29] jsonlite_1.7.0 spatstat_1.64-1 spatstat.data_1.4-3 zoo_1.8-8
[33] ape_5.4 glue_1.4.1 polyclip_1.10-0 gtable_0.3.0
[37] leiden_0.3.3 car_3.0-8 pkgbuild_1.1.0 kernlab_0.9-29
[41] future.apply_1.6.0 abind_1.4-5 rstatix_0.6.0 miniUI_0.1.1.1
[45] Rcpp_1.0.5 xtable_1.8-4 htmlTable_2.0.1 reticulate_1.16
[49] foreign_0.8-80 rsvd_1.0.3 htmlwidgets_1.5.1 httr_1.4.2
[53] RColorBrewer_1.1-2 acepack_1.4.1 ellipsis_0.3.1 ica_1.0-2
[57] farver_2.0.3 pkgconfig_2.0.3 nnet_7.3-14 uwot_0.1.8
[61] deldir_0.1-28 labeling_0.3 tidyselect_1.1.0 rlang_0.4.7
[65] reshape2_1.4.4 later_1.1.0.1 cellranger_1.1.0 munsell_0.5.0
[69] tools_4.0.2 cli_2.0.2 generics_0.0.2 broom_0.7.0
[73] devtools_2.3.0 evaluate_0.14 stringr_1.4.0 fastmap_1.0.1
[77] yaml_2.2.1 goftest_1.2-2 processx_3.4.3 fs_1.4.2
[81] fitdistrplus_1.1-1 zip_2.1.0 purrr_0.3.4 RANN_2.6.1
[85] pbapply_1.4-2 future_1.18.0 nlme_3.1-148 mime_0.9
[89] compiler_4.0.2 rstudioapi_0.11 curl_4.3 plotly_4.9.2.1
[93] png_0.1-7 ggsignif_0.6.0 testthat_2.3.2 spatstat.utils_1.17-0
[97] tibble_3.0.3 stringi_1.4.6 highr_0.8 ps_1.3.3
[101] RSpectra_0.16-0 desc_1.2.0 forcats_0.5.0 Matrix_1.2-18
[105] vctrs_0.3.2 pillar_1.4.6 lifecycle_0.2.0 lmtest_0.9-37
[109] RcppAnnoy_0.0.16 data.table_1.12.8 irlba_2.3.3 httpuv_1.5.4
[113] R6_2.4.1 latticeExtra_0.6-29 promises_1.1.1 rio_0.5.16
[117] sessioninfo_1.1.1 codetools_0.2-16 MASS_7.3-51.6 assertthat_0.2.1
[121] pkgload_1.1.0 rprojroot_1.3-2 withr_2.2.0 sctransform_0.2.1
[125] hms_0.5.3 mgcv_1.8-31 parallel_4.0.2 rpart_4.1-15
[129] tidyr_1.1.0 rmarkdown_2.3 carData_3.0-4 segmented_1.2-0
[133] Rtsne_0.15 shiny_1.5.0 base64enc_0.1-3 tinytex_0.25